package com.leo.csdnspider.processor;

import com.alibaba.fastjson.JSONObject;
import com.leo.csdnspider.entity.ClassInfo;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.text.SimpleDateFormat;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} that scrapes the category sidebar of a CSDN blog page.
 *
 * <p>For each processed page it reads the category names and their counts from the
 * {@code asideCategory} element, derives the blog owner's user name from the first
 * {@code <link>} in the document head, and publishes the result as a {@link ClassInfo}
 * under the page field {@code "classInfo"}.
 *
 * @author Leo
 */
public class ClassInfoProcessor implements PageProcessor {
    private static final Logger logger = Logger.getLogger(ClassInfoProcessor.class);

    /** XPath of the {@code <ul>} that holds one {@code <li>} per category. */
    private static final String CATEGORY_LIST_XPATH = "//*[@id=\"asideCategory\"]/div/ul/";

    /** Host marker that precedes the user name in a blog URL. */
    private static final String BLOG_HOST_MARKER = "blog.csdn.net/";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the category information from {@code page} and stores it in the
     * {@code "classInfo"} result field.
     *
     * <p>If no user name can be derived from the page, the page is marked as skipped
     * and no field is stored.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        Html csdn = page.getHtml();

        // User name: taken from the href of the first <link> in <head>.
        String urlUsername = extractUsername(csdn.xpath("/html/head/link[1]/@href").get());
        if (urlUsername == null) {
            logger.warn("Could not determine the user name, skipping page: " + page.getUrl());
            page.setSkip(true);
            return;
        }

        // Category names (first <span> of each entry).
        List<String> classNames = csdn.xpath(CATEGORY_LIST_XPATH).xpath("/li/a/span[1]/text()").all();

        // Category counts (second <span> of each entry).
        List<String> classCounts = csdn.xpath(CATEGORY_LIST_XPATH).xpath("/li/a/span[2]/text()").all();

        // Pair names with counts by position; never index past the shorter list.
        int pairCount = Math.min(classNames.size(), classCounts.size());
        if (classNames.size() != classCounts.size()) {
            logger.warn("Category name/count mismatch (" + classNames.size() + " names, "
                    + classCounts.size() + " counts) on page: " + page.getUrl());
        }
        JSONObject jsonObject = new JSONObject();
        for (int i = 0; i < pairCount; i++) {
            jsonObject.put(classNames.get(i), classCounts.get(i));
        }

        // Crawl timestamp. A fresh formatter per call because SimpleDateFormat is not thread-safe.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String date = sdf.format(System.currentTimeMillis());

        ClassInfo classInfo = new ClassInfo(null, urlUsername, jsonObject.toString(), date);
        page.putField("classInfo", classInfo);

        logger.info("分类信息 ----> " + classInfo);
    }

    /**
     * Returns the part of {@code href} that follows {@code blog.csdn.net/}.
     *
     * <p>For {@code https://blog.csdn.net/<name>} this is the same text the former
     * fixed-offset {@code substring(22)} produced, but it also works for other schemes
     * and does not throw on short or missing values.
     *
     * @param href the link target, may be {@code null}
     * @return the text after the host marker, or {@code null} if {@code href} is
     *         {@code null}, does not contain the marker, or has nothing after it
     */
    private static String extractUsername(String href) {
        if (href == null) {
            return null;
        }
        int markerStart = href.indexOf(BLOG_HOST_MARKER);
        if (markerStart < 0) {
            return null;
        }
        String username = href.substring(markerStart + BLOG_HOST_MARKER.length());
        return username.isEmpty() ? null : username;
    }

    /**
     * Returns the crawl configuration used by this processor.
     *
     * @return the site settings (3 retries, 1000 ms sleep between requests)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs a one-off crawl of a sample blog page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new ClassInfoProcessor());
        spider.addUrl("https://blog.csdn.net/qq_41113081");
        spider.thread(5);
        spider.setExitWhenComplete(true);
        // run() blocks until the crawl is complete. The previous start() + stop()
        // pair launched the crawl asynchronously and could stop it before any page
        // had been processed.
        spider.run();
    }
}
